# Produces most of the substantive analysis in the Comparative Politics paper
# This file includes small corrections made on July 27, 2020, to match final proofs of published paper
#
# Requires two data files
# 1) The BLS8 unified dataset, available in https://dataverse.harvard.edu/dataverse/bls
# 2) A file (bls8_estimates_parties_long.RData) with the estimates of the position of parties in Brazil. 
#    This file is only needed for a single figure, at the end of this code.
#    Download from the paper's dataverse 
#
# Produces the following output:
# fig-dimensionalityallissues.png -> scree plots and analytics for dimensionality analysis of each issue set. 
# fig-densityeachissue.png  -> density plot for scores on each issue set for three seleted parties
# fig-corrgram-issues&ideo.png -> shows linear correlation coefficients across all sets of issues and ideology estimates
# tab-issuesetcorrelations.csv -> table showing the same correlations as above by BLS wave
# tab-optimalparties.csv -> table showing the two "optimal clusters" of legislators, by party
# fig-bipartysystempartyideoshareright.png -> plots the share of legislators classified in the "right" cluster with the estimated party position
# fig-bipartysystem.png ->  boxplot of the positions of legislators in either cluser by issue set
# fig-bipartysystemdensity-grayscale.png -> density plot of ideological positions of the two clusters

rm(list=ls(all=TRUE))
library(car)
library(corrgram)
library(psych)
library(foreign)
library(gtools)
library(xtable)
library(reshape)
library(plyr)
library(Amelia)

setwd("~/Dropbox/DATA/Paper-BLS8") #Change here to your local folder
save.dir <- "~/Dropbox/LatexFiles/Paper-BLS8/Figures-analysis" #Change here to where you want to save the output

#### DECLARE FUNCTIONS USED IN CODE #############################
nsnr <- function(x){#-999 with NA
			out <- ifelse(x==-999,NA,x)
			return(out)}

invert <- function(x){#REVERSE THE POLARITY OF ANSWERS
	low <- min(x,na.rm=T)
	high <- max(x,na.rm=T)
	print(table(x))
	out <- abs(x-low-high)
	print(table(out))
	return(out)
}

shrink <- function(x){
	xx <- car::recode(x,"c(1,2)=1;c(3,3.25,4)=2;c(5,5.5,6)=3;c(7,7.75,8)=4;c(9,10)=5")
	return(xx)
}
################# END FUNCTIONS #############################


##########################################################################################
##### Preparing  BLS8 Data
########################################################################################## 

#Read the datafile that is deposited in the BLS8 dataverse #
#Adapt the "load" line so that it reads the correct file locally saved #
load("~/Dropbox/Data/BLS8/Data-Preparation/BLS8_Released_20200416.RData") 
d <- bls
names(d) <- toupper(names(d))

## Pre-processing the data ###########################################################

# Fixing polarity (high values should be right wing)
d$COTAAFRO<-invert(nsnr(d$COTAAFRO)) #originally high is left
d$COTRENDA<-invert(nsnr(d$COTRENDA)) #originally high is right
d$CASAMENT<-invert(nsnr(d$CASAMENT)) #originally high is left
d$COTAMULHERES<-invert(nsnr(d$COTAMULHERES)) #originally high is left
d$ABORTO<-nsnr(d$ABORTO)#originally high is right
d$RENDA<-shrink(nsnr(d$RENDA)) #originally high is right
d$INICPRIV<-shrink(invert(nsnr(d$INICPRIV))) #originally high is left
d$GOVRESP<-shrink(nsnr(d$GOVRESP)) #originally high is right
d$CONCORR<-shrink(invert(nsnr(d$CONCORR))) #originally high is left
d$TRABVIDA<-shrink(invert(nsnr(d$TRABVIDA))) #originally high is left
d$GETRICH<-shrink(nsnr(d$GETRICH)) #originally high is right
d$ECONLMR<-invert(nsnr(d$ECONLMR)) #originally high is left
d$ENVIRO<-nsnr(d$ENVIRO)	
d$LRCLASS <- nsnr(d$LRCLASS)
d$FISCAL <- invert(nsnr(d$FISCAL))  #originaly, high values is left
d$SAUDE  <-invert(nsnr(d$SAUDE))  #originally, high values are left
d$EDFUND  <-invert(nsnr(d$EDFUND))  #originally, high values are left
d$EDSUPER   <-invert(nsnr(d$EDSUPER))  #originally, high values are left
d$INFRA <-invert(nsnr(d$INFRA))  #originally, high values are left
d$SOCPOL  <-invert(nsnr(d$SOCPOL))  #originally, high values are left 
d$FFAAINT <- nsnr(d$FFAAINT) #not included in paper due to (a by then) not identified error. Include in the future. 
d$FPOECD2 <-  nsnr(d$FPOECD2)   
d$CHINA <-	nsnr(d$CHINA)	
d$BNDES <- 	nsnr(d$BNDES)		
d$BRFDI <-	nsnr(d$BRFDI)			
d$PCFISCAL<-nsnr(d$PCFISCAL)
d$PCINTERV<-nsnr(d$PCINTERV)
d$PCREDIST<-nsnr(d$PCREDIST)
d$PCVALUES<-nsnr(d$PCVALUES)

## Additional pre-processing of the data ##
d$dFISCAL <- d$dNEW <- d$dTRAD <- d$dLIB <- d$dINT <- NA
the.items <- c('SAUDE','EDFUND','EDSUPER','INFRA','SOCPOL',
				'GETRICH','CONCORR','INICPRIV','TRABVIDA',
				'ECONLMR','GOVRESP','RENDA','COTRENDA',
				'COTAAFRO','ABORTO','CASAMENT',
				'FPOECD2','CHINA','BNDES','BRFDI')
dd <- subset(d,WAVE==2013|WAVE==2017,select=the.items)
ideo <- subset(d,WAVE==2013|WAVE==2017,select=CZIDEO)
the.party <- subset(d,WAVE==2013|WAVE==2017,select=PARTY_SURVEY)
the.uniqueID <-  subset(d,WAVE==2013|WAVE==2017,select= UNIQUEID)
the.wave<-  subset(d,WAVE==2013|WAVE==2017,select= WAVE)
cat("Waves 7 and 8 contain",nrow(dd),"observations\n")
cat("There are",length(which(table(the.uniqueID)>1)),"repeat legislators\n")
the.repeats <- labels(which(table(the.uniqueID)>1))
the.uniqueID$repeats <- is.element(the.uniqueID$UNIQUEID,as.numeric(the.repeats))
the.uniqueID$NAs <- apply(is.na(dd),1,sum)

## Eliminating cases with too many missing repsonses ##
  #Check missing data pattern
  cat("Number of missing observations (in the 20 items)\n")
	table(apply(is.na(dd),1,sum))
	#Closer look at those with 3, 5 and 6 missing responses
	#m3 <- which(apply(is.na(dd),1,sum)==3)
	#print(dd[m3,]) #case 1267 should be dropped (intl completely missing)
	#m5 <- which(apply(is.na(dd),1,sum)==5)
	#print(dd[m5,]) #case 1032 can remain (no set is completely missing)
	#m6 <- which(apply(is.na(dd),1,sum)==6)
	#print(dd[m6,]) #case 1085 can remain (no set is completely missing)
	
	#Eliminate all with 5 or more missing overall 
	#As these have complete missingness in at least one "issue set"
	#Except for  cases manually flagged above
	N.missing <- apply(is.na(dd),1,sum)
	#Dropping cases with 5 or more missing (and exceptions, below) produces sample of 271, in paper
	to.drop <- N.missing >=5 
  	to.drop["1032"]<-F #exception (keep)
  	to.drop["1085"]<-F #exception (keep)
	  to.drop["1267"]<-T #exception (drop)
	
dd <- dd[-which(to.drop),]
ideo <- ideo[-which(to.drop),]
the.party <- the.party[-which(to.drop),] 
the.uniqueID <- the.uniqueID[-which(to.drop),]
the.wave <- the.wave[-which(to.drop),]
cat("Keeping",nrow(dd),"observations\n")
cat("There are",length(which(table(the.uniqueID$UNIQUEID)>1)),"repeat legislators \n")
the.repeats <- labels(which(table(the.uniqueID$UNIQUEID)>1))
the.uniqueID$usedrepeats <- is.element(the.uniqueID$UNIQUEID,as.numeric(the.repeats))
the.uniqueID$wave <- the.wave

#Imputation of remaining missing reponses (using Amelia)
set.seed(1977)
ddi <- amelia(cbind(dd), m = 1, idvars = NULL, ords=1:ncol(dd),
		bounds=cbind(1:ncol(dd),rep(1,ncol(dd)),apply(dd,2,max,na.rm=T)))	
ddi <- ddi$imputations[[1]]
dd[rownames(ddi),]	<- ddi #plug the imputed values back in the data
					

## ANALYSIS ###########################################################

## Create the ex-ante issue sets
qfiscal <- subset(dd,select=c(SAUDE,EDFUND,EDSUPER,INFRA,SOCPOL))
		pcfiscal <- psych::principal(qfiscal, nfactors=1)

qlib <- subset(dd,select=c(GETRICH,CONCORR,INICPRIV,TRABVIDA))
		pclib <- psych::principal(qlib)

qtrad <- subset(dd,select=c(ECONLMR,GOVRESP,RENDA,COTRENDA))
		pctrad <- psych::principal(qtrad )

qnew <- subset(dd,select=c(ABORTO,CASAMENT,COTAAFRO))
		pcnew <- psych::principal(qnew, nfactors=1)
		
qint <- subset(dd,select=c(FPOECD2,CHINA,BNDES,BRFDI))	
		pcint <- psych::principal(qint, nfactors=1)

# Generate figure for Parallel Analysis of all issue sets 
png(file=paste(save.dir,"/fig-dimensionalityallissues.png",sep="")
			,res=600,width=6,height=6,units="in")
par(mfrow=c(2,3))
fa.parallel(qlib,main='Liberalism',show.legend=F,ylab="Eigenvalues")	
fa.parallel(qnew,main='New',show.legend=F,ylab="Eigenvalues")	
fa.parallel(qtrad,main='Traditional',show.legend=F,ylab="Eigenvalues")	
fa.parallel(qfiscal,main='Fiscal',show.legend=F,ylab="Eigenvalues")	
fa.parallel(qint,main='International',show.legend=F,ylab="Eigenvalues")	
dev.off()

# Save issue set scores do main dataframe
d$dTRAD[as.numeric(rownames(pctrad$scores))] <- pctrad$scores
d$dNEW[as.numeric(rownames(pcnew$scores))] <- pcnew$scores
d$dFISCAL[as.numeric(rownames(pcfiscal$scores))] <- pcfiscal$scores
d$dLIB[as.numeric(rownames(pclib$scores))] <- pclib$scores
d$dINT[as.numeric(rownames(pcint$scores))] <- pcint$scores

# Figure for scores on each issue set for selected parties #
plot.dim <- function(the.var,the.title){
  x.min <- min(d[,the.var],na.rm=T)
  x.max<- max(d[,the.var],na.rm=T)
  par(mar=c(2,2,2,2))
  plot(density(d[,the.var][which(d$PARTY_SURVEY==13)],na.rm=T),col=2,xlim=c(x.min,x.max),xlab="",main=the.title,yaxt="n",ylab="",bty="n")
  par(new=T)
  plot(density(d[,the.var][which(d$PARTY_SURVEY==45)],na.rm=T),col="blue",xlim=c(x.min,x.max),xlab="",main="",yaxt="n",ylab="",bty="n")
  par(new=T)
  plot(density(d[,the.var][which(d$PARTY_SURVEY==20)],na.rm=T),col="green",xlim=c(x.min,x.max),xlab="",main="",yaxt="n",bty="n")
}

png(file=paste(save.dir,"/fig-densityeachissue.png",sep="")
    ,res=600,width=6,height=6,units="in")
par(mfrow=c(2,3))
plot.dim("dTRAD","Traditional")
plot.dim("dNEW","New")
plot.dim("dFISCAL","Fiscal")
plot.dim("dLIB","Liberalism")
plot.dim("dINT","International")
plot(0,0,type="n",bty="n",xaxt="n",yaxt='n')
legend(x="center",legend=c("PT","PSDB","PSC"),col=c(2,"blue","green"),lty=1,lwd=2)
dev.off()

## Examine correlations between scores on issue sets and with ideology estimates
outW78 <- subset(d,select=c(CZIDEO,dTRAD,dFISCAL,dNEW,dLIB,dINT))
outW78$CZIDEO <- nsnr(outW78$CZIDEO)

png(file=paste(save.dir,"/fig-corrgram-issues&ideo.png",sep="")
		,res=600,width=6,height=6,units="in")
	par(mar=c(2,2,1,1),mfrow=c(1,1))
	names(outW78) <- c("Ideo","Trad","Fiscal","New","Liber","Int")
	tmp <- corrgram(outW78,lower.panel="panel.conf")
dev.off()

# Analysis of the dimensionlity of the (five) issue-set space
cor(outW78,use="pairwise.complete.obs")
quartz()
VSS(outW78[,-1])
fa.parallel(outW78[, -1])
psych::principal(outW78[,-1],nfactors=1)   

#Correlation table in Supplemental Materials (Appendix)
w7 <- cor(outW78[d$WAVE==2013,],use="pairwise.complete.obs")
w8 <- cor(outW78[d$WAVE==2017,],use="pairwise.complete.obs")
the.table <- round(cbind(upper.tri(w7)*w7,upper.tri(w8)*w8),2)
write.csv(the.table,file=paste(save.dir,"/tab-issuesetcorrelations.csv",sep=""))

# Create party dataframe with average positions and size for selected parties
N <- table(d$PARTY_SURVEY[d$WAVE==2017|d$WAVE==2013])
selected <- names(which(N>=10))
the.space <- principal(outW78[,-ncol(outW78)],nfactors=2) 
d$s1 <- d$s2 <- NA
d$s1[as.numeric(rownames(the.space$scores))]<- the.space$scores[,1]
d$s2[as.numeric(rownames(the.space$scores))]<- the.space$scores[,2]
outp <- merge(na.omit(data.frame(
  CZIDEO = as.matrix(by(d$CZIDEO,d$PARTY_SURVEY,median,na.rm=T)),
  s1 = as.matrix(by(d$s1,d$PARTY_SURVEY,median,na.rm=T)),
  s2 = as.matrix(by(d$s2,d$PARTY_SURVEY,median,na.rm=T)),
  dFISCAL = as.matrix(by(d$dFISCAL,d$PARTY_SURVEY,median,na.rm=T)),
  dNEW = as.matrix(by(d$dNEW,d$PARTY_SURVEY,median,na.rm=T)),
  dTRAD = as.matrix(by(d$dTRAD,d$PARTY_SURVEY,median,na.rm=T)),
  dLIB = as.matrix(by(d$dLIB,d$PARTY_SURVEY,median,na.rm=T)),
  dINT = as.matrix(by(d$dINT,d$PARTY_SURVEY,median,na.rm=T))))
	,data.frame( table(d$PARTY_SURVEY[d$WAVE==2017|d$WAVE==2013]))
	,by.x=0,by.y="Var1")
outp$PARTYNAME <- car::recode(outp[,1],"10='PRB';11='PDS';12='PDT';13='PT';14='PTB';15='PMDB';16='PSTU';17='PDC';
                       18='REDE';19='PODE';21='PCB';20='PSC';22='PR';23='PPS';25='PFL';26='PMB';	
                      					27='PSDC';28='PTR';29='PCO';31='PHS';33='PMN';36='PRN';
                       40='PSB';41='PSD';44='PRP';43='PV';45='PSDB';50='PSOL';55='PSD';52='PST';56='PRONA';
                       65='PCDOB';70='PSL';73='PSdoB';75='PMSD';77='SD';81='PNTB';84='PFS';90='PROS'")
outp <- subset(outp,Freq>=8)
names(outp)[1] <- "PARTY"

# Party space is clearly one-dimensional ##
pp <- subset(outp,select=c(dFISCAL,dTRAD,dNEW,dLIB,dINT))
fa.parallel(pp)
principal(pp,nfactors=1) 


## CLUSTERING OF LEGISLATORS ####################################################

### How do legislators cluster ?   #######
how <- "dim" #use this to base the clustering on the scores on the five issue sets
#how <- "issue" #use this to base the clustering on answers to the twenty questions
if(how=="dim"){
	w <- na.omit(outW78[,-1])}
if(how=="issue"){
 	w <- na.omit(dd)
 }
w <- scale(w) #scale
d$cloptimal <- d$clh <- NA

#Optimal medioid clustering (using PAM) 
pamall <- fpc::pamk(w,krange=2:10,critout=T)
#pamall <- fpc::pamk(w,krange=2:10,critout=T,criterion="ch") #alternative, yields identical classification
n.clusters <- pamall$nc  #optimal number of cluseters
clustering <- pamall$pamobject$clustering
d$cloptimal[as.numeric(names(clustering))]<- as.numeric(clustering)
#plot(pamall$pamobject)

# Ward Hierarchical Clustering
# Yields very similar results
whc <- dist(w, method = "euclidean") # distance matrix
fit <- hclust(whc, method="ward.D2") 
plot(fit,labels=the.party,cex=0.3) # display dendogram
groups <- cutree(fit, k=n.clusters) 
#rect.hclust(fit, k=2, border="red")
d$clh[as.numeric(names(groups))]<- as.numeric(groups)

#Two classifications are quite consistent 				
table(d$PARTY_SURVEY[d$WAVE>=2013],d$clh[d$WAVE>=2013])
prop.table(table(optimal=d$cloptimal,hierchical=d$clh)) 
cat(round(100*sum(diag(prop.table(table(d$cloptimal,d$clh)))),2),"% are equally classified in both methods\n",sep="")
error.class <- subset(d,clh!=cloptimal,select=c(PARTY_SURVEY,WAVE))
#sort(table(error.class$PARTY_SURVEY)) #PSDB and PMDB lead number of undifiened

# Create table
tabop <- addmargins(table(d$PARTY_SURVEY[d$WAVE>=2013]
				,d$cloptimal[d$WAVE>=2013]),1) #for paper
rownames(tabop) <- car::recode(rownames(tabop),"10='PRB';11='PP';12='PDT';13='PT';14='PTB';15='PMDB';16='PSTU';17='PDC';
                       18='REDE';19='PODE';21='PCB';20='PSC';22='PL';23='CID';25='DEM';26='PMB';	
                      					27='PSDC';28='PTR';29='PCO';31='PHS';33='PMN';36='PRN';
                       40='PSB';41='PSD';44='PRP';43='PV';45='PSDB';50='PSOL';55='PSD';52='PST';56='PRONA';
                       65='PCDOB';70='PSL';73='PSdoB';75='PMSD';77='SD';81='PNTB';84='PFS';90='PROS'")
                                   
tabop <- rbind(tabop,Share=round(tabop["Sum",]/sum(tabop["Sum",])*100))[,c(2,1)]#put "right" party on the right
tabopx <- xtable::xtable(tabop,digits=0)				
caption(tabopx) <- "Party Membership With Optimal Clustering"
label(tabopx) <- "tab:cluster"
print(tabopx,caption.placement="top")		
write.csv(tabop,file=paste(save.dir,"/tab-optimalparties.csv",sep=""))
	

#Compare share of left and right with party idelogical position ###
load('~/Dropbox/Data/Paper-BLS8/bls8_estimates_parties_long.RData')  #this file is also in Dataverse
pp <- subset(party.table,year==2017,select=c(party,ideo))
tabop <- data.frame(tabop)
tabop$share.right <- tabop[,1]/(tabop[,1]+tabop[,2])
pp <- merge(pp,tabop,by.x='party',by.y=0)
pp <- subset(pp,pp$X1+pp$X2>2)
png(file=paste(save.dir,"/fig-bipartysystempartyideoshareright.png",sep="")
	,res=600,width=10,height=6,units="in")
par(mar=c(4,4,1,1))
plot(pp$ideo,pp$share.right
	,bty='n',xlab="Estimate of the Party's Position"
	,ylab="Propotion of Members is Center-Right  Cluster"
	,xlim=c(-1,0.55),xpd=NULL)
text(pp$ideo,pp$share.right,pos=3,label=pp$party,xpd=NA,cex=0.8)
text(0.48,0,label=paste("r=",round(cor.test(pp$share.right,pp$ideo)$est
,2)))
abline(reg=lm(pp$share.right~pp$ideo),lty=2)
dev.off()

#How would they stand on each issue
png(file=paste(save.dir,"/fig-bipartysystem.png",sep="")
	,res=600,width=10,height=6,units="in")
par(mfrow=c(1,1),mar=c(3,3,1,1))
tmp <- na.omit(melt.data.frame(d
		,id.vars="cloptimal"
		,measure.vars=c("dLIB","dTRAD","dNEW","dFISCAL","dINT")))
tmp$cloptimal <- ifelse(tmp$cloptimal==2,1,2)#invert 1 and 2 for grpah
bb <- boxplot(value~cloptimal+variable,data=tmp
		#, border = c(gray(0),gray(0.5))
		, col = c(gray(0.7),0)
		, xaxt="n"
		, ylab="Issue Scores"
		, bty="n")
axis(side=1,at=seq(1.5,9.5,by=2)
	,labels=c("LIBERALISM","TRADITIONAL\nISSUES","NEW\nISSUES","FISCAL\nISSUES","INTERNATIONAL\nISSUES")
	, tick=F)
legend(x="topleft",legend=c("`Left' Cluster","'Right' Cluster")
		 , fill= c(gray(0.7),0)
		 , bty="n")
dev.off()


png(file=paste(save.dir,"/fig-bipartysystemdensity-grayscale.png",sep="")
    ,res=600,width=10,height=6,units="in")
tmp <- subset(d,select=c(cloptimal,CZIDEO))
tmp$CZIDEO[which(tmp$CZIDEO==-999)]<-NA
tmp1 <- na.omit(subset(tmp,cloptimal==1))
tmp2 <- na.omit(subset(tmp,cloptimal==2))
par(mar=c(4,4,1,1),fig=c(0,1,0,0.8),new=F)
plot(density(tmp1$CZIDEO)
     ,bty="n",ylim=c(0,1.5),xlim=c(-2.5,2.5)
     ,main="",xlab="Ideology (Individual Estimate)")
lines(density(tmp2$CZIDEO),col=gray(0.5))
legend(x="topleft",legend=c("'Left' Cluster","'Right' Cluster"),
       lty=1,col=c(gray(0.5),1),
       bty="n")
par(fig=c(0,1,0.65,1), new=TRUE)
boxplot(CZIDEO~cloptimal,data=tmp,horizontal=T,axes=F,border=c(1,gray(0.5)))
dev.off()		